Click to show code
# Identify values causing the issue
problematic_values <- properties$number_of_rooms[is.na(as.numeric(properties$number_of_rooms))]
#> Warning: NAs introduced by coercion
# Replace non-numeric values with NA
#properties$number_of_rooms <- as.numeric(gsub("[^0-9.]", "", properties$number_of_rooms))
# Remove non-numeric characters and convert to numeric
properties$price <- as.numeric(gsub("[^0-9]", "", properties$price))
# Subset the dataset to exclude rows with price < 20000
properties <- properties[properties$price >= 20000, ]
# Subset the dataset to exclude rows with numbers of rooms < 25
#properties <- properties[properties$number_of_rooms <25, ]
# Replace incomplete addresses
properties$address <- gsub("^\\W*[.,0-]\\W*", "", properties$address)
properties_filtered <- na.omit(properties)
properties_filtered$year_category <- substr(properties_filtered$year_category, 1, 9)
# Assuming 'year_category' is a column in the 'properties' dataset
properties_filtered$year_category <- as.factor(properties_filtered$year_category)
# remove m^2 from column 'square_meters'
properties_filtered$square_meters <- as.numeric(gsub("\\D", "", properties_filtered$square_meters))
# print how many NA observations left in square_meters
print(sum(is.na(properties_filtered$square_meters)))
#> [1] 1056
# remove NA
properties_filtered <- properties_filtered[!is.na(properties_filtered$square_meters),]
# add majuscule to canton
properties_filtered$canton <- tools::toTitleCase(properties_filtered$canton)
# # Preprocess the number_of_rooms column
properties_filtered$number_of_rooms <- gsub(" rooms", "", properties_filtered$number_of_rooms)
properties_filtered$number_of_rooms <- gsub(" room", "", properties_filtered$number_of_rooms)
# Remove rows with "m²" in the "number_of_rooms" column
properties_filtered <- properties_filtered[!grepl("m²", properties_filtered$number_of_rooms), ]
properties_filtered$number_of_rooms <- as.numeric(properties_filtered$number_of_rooms)
# Remove rows with rooms >= 100
properties_filtered <- properties_filtered[properties_filtered$number_of_rooms <= 100, , drop = FALSE]
# Divide cells by 10 if number of rooms is more than 27
properties_filtered$number_of_rooms <- ifelse(properties_filtered$number_of_rooms > 27,
properties_filtered$number_of_rooms / 10,
properties_filtered$number_of_rooms)
#remove row with weird number of rooms
properties_filtered <- properties_filtered[properties_filtered$number_of_rooms != 7.6, ]
# properties_filtered$number_of_rooms <- as.character(properties_filtered$number_of_rooms)
# properties_filtered$number_of_rooms <- gsub("\\D", properties_filtered$number_of_rooms) # Remove non-numeric characters
# properties_filtered$number_of_rooms <- as.numeric(properties_filtered$number_of_rooms) # Convert to numeric
# properties_filtered$number_of_rooms <- trunc(properties_filtered$number_of_rooms) # Truncate non-integer values